Loading in necessary libraries

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

The TCGA MAF summary file

# Read the TCGA MAF summary table (one row per tumour sample barcode) and
# derive the identifiers that the later sections use to join it to other tables.
maf_file <- "/media/theron/My_Passport/TCGA_junctions/maf_summary.txt"
mc3_maf <- read.table(maf_file, header = TRUE)

# Sample-level barcode with its last character removed
# (presumably the vial letter -- TODO confirm against the TCGA barcode spec).
maf_sample_barcodes <- TCGAbarcode(mc3_maf$Tumor_Sample_Barcode, sample = TRUE)
mc3_maf$Tumor_Sample_ID <- vapply(
  maf_sample_barcodes,
  function(barcode) {
    substr(barcode, 1, nchar(barcode) - 1)
  },
  character(1)
)

# Participant-level barcode, used to restrict the table to one cohort.
mc3_maf$participant_ID <- TCGAbarcode(
  mc3_maf$Tumor_Sample_Barcode,
  participant = TRUE
)

# Index rows by the full barcode; the per-cancer loops parse the sample-type
# code back out of these row names.
rownames(mc3_maf) <- mc3_maf$Tumor_Sample_Barcode

Accumulating mutation data per sample (annotated splice clusters only)

# Object stored in the CHOL juncrse.rds file; only the names of its column
# metadata fields are extracted here.
junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- names(junc_metadata)

# filenames.txt lists one cancer directory per row (read into column V1);
# the cancer code is the last path component of each entry.
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data[["V1"]])
# TMB<-list()

# One row per cancer; the per-cancer loop adds the summary columns.
cluster_metrics_tum <- data.frame(cancers = cancers)

# Per-cancer analysis of the annotated splice-cluster tables
# (<cancer>_splice_dat_clusters_filt_ann.rds). For each cancer directory in
# `tumor_data` the loop:
#   1. restricts the MAF summary to that cohort's samples labelled "T",
#   2. relabels the splice-table columns with sample IDs and aligns them to
#      the MAF rows,
#   3. stores per-sample and per-cancer summaries, and
#   4. prints a heatmap plus three TMB-vs-splicing scatter plots.
# Per-sample tables are collected in a list and bound once after the loop
# rather than growing a data frame with rbind() on every iteration.
per_sample_tables <- vector("list", nrow(tumor_data))

for (i in seq_len(nrow(tumor_data))) {
  print(sprintf("%d out of %d", i, nrow(tumor_data)))
  tumor_dir <- tumor_data$V1[i]
  cancer <- basename(tumor_dir)
  print(cancer)

  # Cohort metadata; column 4 is passed to TCGAbarcode(), so it is assumed to
  # hold TCGA barcodes -- TODO confirm against the metadata file layout.
  tumor_meta_file <- sprintf("%s/%s_metadata.txt", tumor_dir, cancer)
  tumor_meta <- read.table(tumor_meta_file, quote = "", sep = "\t")
  tumor_meta$participant_ID <- TCGAbarcode(tumor_meta[, 4], participant = TRUE)
  # Not used again in this loop; kept so `tumor_meta` has the same columns.
  tumor_meta$nbases <- tumor_meta[, ncol(tumor_meta) - 9]

  mc3_maf_small <- subset(mc3_maf, participant_ID %in% tumor_meta$participant_ID)
  mc3_maf_small <- mc3_maf_small[complete.cases(mc3_maf_small), ]

  # Row names are the full tumour barcodes; the first two characters of the
  # fourth "-"-separated field are the numeric sample-type code.
  # Codes <= 9 are labelled "T", 10-19 "N", anything higher "C"
  # (presumably tumour / normal / control). Unparseable barcodes give NA and
  # are removed by the filter below instead of aborting the loop.
  sample_field <- vapply(
    strsplit(rownames(mc3_maf_small), "-", fixed = TRUE),
    function(parts) parts[4],
    character(1)
  )
  sample_code <- as.numeric(substr(sample_field, 1, 2))
  mc3_maf_small$type <- ifelse(
    sample_code <= 9, "T",
    ifelse(sample_code <= 19, "N", "C")
  )
  mc3_maf_small$TMB <- log10(mc3_maf_small$total + 1)
  mc3_maf_small <- mc3_maf_small %>% dplyr::filter(type == "T")

  tumor_geno_file <- sprintf("%s/%s_genotypes.txt", tumor_dir, cancer)
  tumor_geno <- read.table(tumor_geno_file, header = TRUE)
  splice_mut_file <- sprintf("%s/%s_splice_dat_clusters_filt_ann.rds", tumor_dir, cancer)
  splice_mut_data <- readRDS(splice_mut_file)

  # Column names are external IDs with "." in place of "-" and, where R
  # prefixed one, a leading "X". Only a *leading* X is stripped so that an X
  # elsewhere in the ID is left alone. Each ID is mapped to the first matching
  # sample_id; IDs with no match become NA.
  external_ids <- str_replace(colnames(splice_mut_data), "^X", "")
  external_ids <- str_replace_all(external_ids, "[.]", "-")
  colnames(splice_mut_data) <- as.character(
    tumor_geno$sample_id[match(external_ids, tumor_geno$external_id)]
  )

  mc3_maf_small <- mc3_maf_small %>%
    dplyr::filter(Tumor_Sample_ID %in% colnames(splice_mut_data))
  if (nrow(mc3_maf_small) == 0) {
    # Nothing to summarise or plot; the code below cannot handle an empty
    # table, so skip this cohort and leave its summary entries unset.
    warning(
      sprintf("%s: no tumour samples shared between the MAF summary and the splice table; skipping", cancer),
      call. = FALSE
    )
    next
  }

  # Reorder/select columns to match the MAF rows. drop = FALSE keeps a
  # two-dimensional object even when a single sample remains.
  splice_mut_data <- splice_mut_data[, mc3_maf_small$Tumor_Sample_ID, drop = FALSE]

  splice_mut_per_sample <- data.frame(
    sample = colnames(splice_mut_data),
    splice_mut_av = apply(splice_mut_data, 2, mean),
    splice_mut_med = apply(splice_mut_data, 2, median),
    TMB = mc3_maf_small$TMB,
    cancer = cancer,
    row.names = NULL
  )

  # Cancer-level summaries: the median ignores samples whose per-sample median
  # is zero; the average uses every sample.
  this_cancer <- cluster_metrics_tum$cancers == cancer
  nonzero_medians <- splice_mut_per_sample$splice_mut_med[
    splice_mut_per_sample$splice_mut_med > 0
  ]
  cluster_metrics_tum[this_cancer, "splice_imm_med"] <- median(nonzero_medians)
  cluster_metrics_tum[this_cancer, "splice_imm_av"] <- mean(splice_mut_per_sample$splice_mut_av)
  cluster_metrics_tum[this_cancer, "TMB"] <- median(splice_mut_per_sample$TMB)

  per_sample_tables[[i]] <- splice_mut_per_sample

  splice_mut_data_mat <- as.matrix(log10(splice_mut_data + 1))
  colnames(splice_mut_data_mat) <- colnames(splice_mut_data)

  print(Heatmap(splice_mut_data_mat,
    top_annotation = HeatmapAnnotation(TMB = anno_barplot(mc3_maf_small$TMB)),
    show_row_names = FALSE,
    show_column_names = FALSE,
    cluster_rows = TRUE,
    cluster_columns = TRUE
  ))

  # The cancer code is passed as the title directly; routing it through
  # sprintf() would treat it as a format string.
  print(ggplot(splice_mut_per_sample, aes(x = splice_mut_av, y = TMB)) +
    geom_point() +
    stat_cor(method = "spearman") +
    geom_smooth(method = "lm") +
    labs(title = cancer))

  print(ggplot(splice_mut_per_sample, aes(x = splice_mut_med, y = TMB)) +
    geom_point() +
    stat_cor(method = "spearman") +
    geom_smooth(method = "lm") +
    labs(title = cancer))

  # Same plot again without the samples whose median is zero.
  splice_mut_per_sample <- splice_mut_per_sample %>%
    dplyr::filter(splice_mut_med > 0)

  print(ggplot(splice_mut_per_sample, aes(x = splice_mut_med, y = TMB)) +
    geom_point() +
    stat_cor(method = "spearman") +
    geom_smooth(method = "lm") +
    labs(title = cancer))
}

# Bind the per-cancer tables (skipped cohorts left NULL entries).
per_sample_tables <- per_sample_tables[
  !vapply(per_sample_tables, is.null, logical(1))
]
splice_mut_per_sample_all <- do.call(rbind, per_sample_tables)
## [1] "1 out of 14"
## [1] "BLCA"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "2 out of 14"
## [1] "BRCA"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## [1] "3 out of 14"
## [1] "CHOL"

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "4 out of 14"
## [1] "COAD"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "5 out of 14"
## [1] "HNSC"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "6 out of 14"
## [1] "KICH"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "7 out of 14"
## [1] "KIRP"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "8 out of 14"
## [1] "LIHC"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "9 out of 14"
## [1] "LUAD"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "10 out of 14"
## [1] "LUSC"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "11 out of 14"
## [1] "PRAD"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "12 out of 14"
## [1] "READ"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "13 out of 14"
## [1] "THCA"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "14 out of 14"
## [1] "UCEC"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

# Cancer-level comparison: per-cancer TMB against the log10(x + 1)-transformed
# median splicing metric, drawn as one text label per cancer.
ggplot(
  data = cluster_metrics_tum,
  mapping = aes(x = log10(splice_imm_med + 1), y = TMB, label = cancers)
) +
  geom_text() +
  stat_cor(
    label.x.npc = "center",
    label.y.npc = "top",
    method = "spearman"
  ) +
  labs(x = "splicing antigenicity median")

# Same comparison using the per-cancer average splicing metric.
ggplot(
  data = cluster_metrics_tum,
  mapping = aes(x = log10(splice_imm_av + 1), y = TMB, label = cancers)
) +
  geom_text() +
  stat_cor(
    label.x.npc = "center",
    label.y.npc = "top",
    method = "spearman"
  ) +
  labs(x = "splicing antigenicity average")

# All samples pooled across cancers: TMB against the per-sample average
# splicing metric. The title must be passed as `title =`; an unnamed string
# given to labs() is not applied to any label.
ggplot(splice_mut_per_sample_all, aes(x = splice_mut_av, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = "All Samples")
## `geom_smooth()` using formula 'y ~ x'

# All samples pooled across cancers: TMB against the per-sample median
# splicing metric. The title must be passed as `title =`; an unnamed string
# given to labs() is not applied to any label.
ggplot(splice_mut_per_sample_all, aes(x = splice_mut_med, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = "All Samples")
## `geom_smooth()` using formula 'y ~ x'

# Drop samples whose per-sample median is zero, then repeat the pooled
# median-vs-TMB plot on the remainder.
splice_mut_per_sample_all_filt <- splice_mut_per_sample_all %>%
  dplyr::filter(splice_mut_med > 0)

# The title must be passed as `title =`; an unnamed string given to labs()
# is not applied to any label.
ggplot(splice_mut_per_sample_all_filt, aes(x = splice_mut_med, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = "All Samples, No Zero Median samples")
## `geom_smooth()` using formula 'y ~ x'

Accumulating mutation data per sample

# Re-read the CHOL juncrse.rds object and keep the names of its column
# metadata fields.
junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- names(junc_metadata)

# Cancer directories (column V1 of filenames.txt) and their cancer codes.
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data[["V1"]])
# TMB<-list()

# Fresh per-cancer summary table; this discards the columns filled in by the
# previous section's loop.
cluster_metrics_tum <- data.frame(cancers = cancers)

# Per-cancer analysis of the full filtered splice-cluster tables
# (<cancer>_splice_dat_clusters_filt.rds). The steps mirror the loop over the
# "_ann" tables earlier in this document; only the input file differs.
# For each cancer directory in `tumor_data` the loop:
#   1. restricts the MAF summary to that cohort's samples labelled "T",
#   2. relabels the splice-table columns with sample IDs and aligns them to
#      the MAF rows,
#   3. stores per-sample and per-cancer summaries, and
#   4. prints a heatmap plus three TMB-vs-splicing scatter plots.
# Per-sample tables are gathered in a list and bound once after the loop
# rather than growing a data frame with rbind() on every iteration.
per_sample_tables <- vector("list", nrow(tumor_data))

for (i in seq_len(nrow(tumor_data))) {
  print(sprintf("%d out of %d", i, nrow(tumor_data)))
  tumor_dir <- tumor_data$V1[i]
  cancer <- basename(tumor_dir)
  print(cancer)

  # Cohort metadata; column 4 is passed to TCGAbarcode(), so it is assumed to
  # hold TCGA barcodes -- TODO confirm against the metadata file layout.
  tumor_meta_file <- sprintf("%s/%s_metadata.txt", tumor_dir, cancer)
  tumor_meta <- read.table(tumor_meta_file, quote = "", sep = "\t")
  tumor_meta$participant_ID <- TCGAbarcode(tumor_meta[, 4], participant = TRUE)
  # Not used again in this loop; kept so `tumor_meta` has the same columns.
  tumor_meta$nbases <- tumor_meta[, ncol(tumor_meta) - 9]

  mc3_maf_small <- subset(mc3_maf, participant_ID %in% tumor_meta$participant_ID)
  mc3_maf_small <- mc3_maf_small[complete.cases(mc3_maf_small), ]

  # Row names are the full tumour barcodes; the first two characters of the
  # fourth "-"-separated field are the numeric sample-type code.
  # Codes <= 9 are labelled "T", 10-19 "N", anything higher "C"
  # (presumably tumour / normal / control). Unparseable barcodes give NA and
  # are removed by the filter below instead of aborting the loop.
  sample_field <- vapply(
    strsplit(rownames(mc3_maf_small), "-", fixed = TRUE),
    function(parts) parts[4],
    character(1)
  )
  sample_code <- as.numeric(substr(sample_field, 1, 2))
  mc3_maf_small$type <- ifelse(
    sample_code <= 9, "T",
    ifelse(sample_code <= 19, "N", "C")
  )
  mc3_maf_small$TMB <- log10(mc3_maf_small$total + 1)
  mc3_maf_small <- mc3_maf_small %>% dplyr::filter(type == "T")

  tumor_geno_file <- sprintf("%s/%s_genotypes.txt", tumor_dir, cancer)
  tumor_geno <- read.table(tumor_geno_file, header = TRUE)
  splice_mut_file <- sprintf("%s/%s_splice_dat_clusters_filt.rds", tumor_dir, cancer)
  splice_mut_data <- readRDS(splice_mut_file)

  # Column names are external IDs with "." in place of "-" and, where R
  # prefixed one, a leading "X". Only a *leading* X is stripped so that an X
  # elsewhere in the ID is left alone. Each ID is mapped to the first matching
  # sample_id; IDs with no match become NA.
  external_ids <- str_replace(colnames(splice_mut_data), "^X", "")
  external_ids <- str_replace_all(external_ids, "[.]", "-")
  colnames(splice_mut_data) <- as.character(
    tumor_geno$sample_id[match(external_ids, tumor_geno$external_id)]
  )

  mc3_maf_small <- mc3_maf_small %>%
    dplyr::filter(Tumor_Sample_ID %in% colnames(splice_mut_data))
  if (nrow(mc3_maf_small) == 0) {
    # Nothing to summarise or plot; the code below cannot handle an empty
    # table, so skip this cohort and leave its summary entries unset.
    warning(
      sprintf("%s: no tumour samples shared between the MAF summary and the splice table; skipping", cancer),
      call. = FALSE
    )
    next
  }

  # Reorder/select columns to match the MAF rows. drop = FALSE keeps a
  # two-dimensional object even when a single sample remains.
  splice_mut_data <- splice_mut_data[, mc3_maf_small$Tumor_Sample_ID, drop = FALSE]

  splice_mut_per_sample <- data.frame(
    sample = colnames(splice_mut_data),
    splice_mut_av = apply(splice_mut_data, 2, mean),
    splice_mut_med = apply(splice_mut_data, 2, median),
    TMB = mc3_maf_small$TMB,
    cancer = cancer,
    row.names = NULL
  )

  # Cancer-level summaries: the median ignores samples whose per-sample median
  # is zero; the average uses every sample.
  this_cancer <- cluster_metrics_tum$cancers == cancer
  nonzero_medians <- splice_mut_per_sample$splice_mut_med[
    splice_mut_per_sample$splice_mut_med > 0
  ]
  cluster_metrics_tum[this_cancer, "splice_imm_med"] <- median(nonzero_medians)
  cluster_metrics_tum[this_cancer, "splice_imm_av"] <- mean(splice_mut_per_sample$splice_mut_av)
  cluster_metrics_tum[this_cancer, "TMB"] <- median(splice_mut_per_sample$TMB)

  per_sample_tables[[i]] <- splice_mut_per_sample

  splice_mut_data_mat <- as.matrix(log10(splice_mut_data + 1))
  colnames(splice_mut_data_mat) <- colnames(splice_mut_data)

  print(Heatmap(splice_mut_data_mat,
    top_annotation = HeatmapAnnotation(TMB = anno_barplot(mc3_maf_small$TMB)),
    show_row_names = FALSE,
    show_column_names = FALSE,
    cluster_rows = TRUE,
    cluster_columns = TRUE
  ))

  # The cancer code is passed as the title directly; routing it through
  # sprintf() would treat it as a format string.
  print(ggplot(splice_mut_per_sample, aes(x = splice_mut_av, y = TMB)) +
    geom_point() +
    stat_cor(method = "spearman") +
    geom_smooth(method = "lm") +
    labs(title = cancer))

  print(ggplot(splice_mut_per_sample, aes(x = splice_mut_med, y = TMB)) +
    geom_point() +
    stat_cor(method = "spearman") +
    geom_smooth(method = "lm") +
    labs(title = cancer))

  # Same plot again without the samples whose median is zero.
  splice_mut_per_sample <- splice_mut_per_sample %>%
    dplyr::filter(splice_mut_med > 0)

  print(ggplot(splice_mut_per_sample, aes(x = splice_mut_med, y = TMB)) +
    geom_point() +
    stat_cor(method = "spearman") +
    geom_smooth(method = "lm") +
    labs(title = cancer))
}

# Bind the per-cancer tables (skipped cohorts left NULL entries).
per_sample_tables <- per_sample_tables[
  !vapply(per_sample_tables, is.null, logical(1))
]
splice_mut_per_sample_all <- do.call(rbind, per_sample_tables)
## [1] "1 out of 14"
## [1] "BLCA"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "2 out of 14"
## [1] "BRCA"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## [1] "3 out of 14"
## [1] "CHOL"

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "4 out of 14"
## [1] "COAD"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "5 out of 14"
## [1] "HNSC"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "6 out of 14"
## [1] "KICH"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "7 out of 14"
## [1] "KIRP"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "8 out of 14"
## [1] "LIHC"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "9 out of 14"
## [1] "LUAD"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "10 out of 14"
## [1] "LUSC"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "11 out of 14"
## [1] "PRAD"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "12 out of 14"
## [1] "READ"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "13 out of 14"
## [1] "THCA"
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## [1] "14 out of 14"
## [1] "UCEC"
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

# Cancer-level comparison on the untransformed scale: per-cancer TMB against
# the median splicing metric, drawn as one text label per cancer.
ggplot(
  data = cluster_metrics_tum,
  mapping = aes(x = splice_imm_med, y = TMB, label = cancers)
) +
  geom_text() +
  stat_cor(
    label.x.npc = "center",
    label.y.npc = "top",
    method = "spearman"
  ) +
  labs(x = "splicing antigenicity median")

# Same comparison using the per-cancer average splicing metric.
ggplot(
  data = cluster_metrics_tum,
  mapping = aes(x = splice_imm_av, y = TMB, label = cancers)
) +
  geom_text() +
  stat_cor(
    label.x.npc = "center",
    label.y.npc = "top",
    method = "spearman"
  ) +
  labs(x = "splicing antigenicity average")

# All samples pooled across cancers: TMB against the per-sample average
# splicing metric. The title must be passed as `title =`; an unnamed string
# given to labs() is not applied to any label.
ggplot(splice_mut_per_sample_all, aes(x = splice_mut_av, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = "All Samples")
## `geom_smooth()` using formula 'y ~ x'

# All samples pooled across cancers: TMB against the per-sample median
# splicing metric, with the sample count in the title. The title must be
# passed as `title =`; an unnamed string given to labs() is not applied to
# any label.
ggplot(splice_mut_per_sample_all, aes(x = splice_mut_med, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = sprintf("All Samples: %d", nrow(splice_mut_per_sample_all)))
## `geom_smooth()` using formula 'y ~ x'

# Drop samples whose per-sample median is zero, then repeat the pooled
# median-vs-TMB plot on the remainder with the remaining count in the title.
splice_mut_per_sample_all_filt <- splice_mut_per_sample_all %>%
  dplyr::filter(splice_mut_med > 0)

# The title must be passed as `title =`; an unnamed string given to labs()
# is not applied to any label.
ggplot(splice_mut_per_sample_all_filt, aes(x = splice_mut_med, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = sprintf(
    "No Zero Median samples: %d",
    nrow(splice_mut_per_sample_all_filt)
  ))
## `geom_smooth()` using formula 'y ~ x'

Looking at cibersort data per sample

# Join the CIBERSORT relative-fraction table to the per-sample splicing/TMB
# table built by the preceding loop.
cibersort_file <- "/media/theron/My_Passport/TCGA_junctions/ext_dat/TCGA.Kallisto.fullIDs.cibersort.relative.tsv"
cibersort_data <- read.table(cibersort_file, header = TRUE)

# SampleID uses "." where TCGA barcodes use "-". After conversion, take the
# sample-level barcode and remove its last character so the key has the same
# form as the `sample` column of splice_mut_per_sample_all.
cibersort_barcodes <- TCGAbarcode(
  str_replace_all(cibersort_data$SampleID, "[.]", "-"),
  sample = TRUE
)
cibersort_data$sample <- substr(
  cibersort_barcodes, 1, nchar(cibersort_barcodes) - 1
)

# Row names must be unique, so this also fails loudly if a sample occurs more
# than once in the per-sample table.
rownames(splice_mut_per_sample_all) <- splice_mut_per_sample_all$sample

cibersort_data_filt <- cibersort_data %>%
  dplyr::filter(sample %in% splice_mut_per_sample_all$sample)

# One vectorised lookup replaces the per-row which() search and the
# matrix(unlist()) round-trip; every key is present because of the filter
# above, so no index is NA.
sample_idx <- match(cibersort_data_filt$sample, splice_mut_per_sample_all$sample)
cibersort_append <- splice_mut_per_sample_all[
  sample_idx,
  c("splice_mut_av", "splice_mut_med", "TMB", "cancer"),
  drop = FALSE
]
rownames(cibersort_append) <- NULL

# NOTE(review): the original code overwrote `cancers` (previously the vector
# of cancer codes) with the per-sample cancer labels. That side effect is
# preserved here in case later code relies on it -- confirm it is intended.
cancers <- setNames(cibersort_append$cancer, cibersort_data_filt$sample)

cibersort_data_filt <- cbind(cibersort_data_filt, cibersort_append)

Naive B-cells and other immune cell types (memory B-cells, plasma cells, CD8 T-cells, naive CD4 T-cells)

# Scatter every CIBERSORT cell-type fraction against TMB and the two
# per-sample splicing metrics. This replaces fifteen copy-pasted ggplot calls
# with one helper and a loop; the plots come out in the same order as before
# (cell type outermost; TMB, average, median within each cell type).
immune_cell_cols <- c(
  "B.cells.naive",
  "B.cells.memory",
  "Plasma.cells",
  "T.cells.CD8",
  "T.cells.CD4.naive"
)
predictor_cols <- c("TMB", "splice_mut_av", "splice_mut_med")

# Fail early with a clear message if an expected column is missing.
stopifnot(
  all(c(immune_cell_cols, predictor_cols) %in% colnames(cibersort_data_filt))
)

# Build one scatter plot of `y_col` against `x_col` with a Spearman
# correlation label and a linear-model smoother.
#   data  : data frame containing both columns
#   x_col : name of the x-axis column (character scalar)
#   y_col : name of the y-axis column (character scalar)
# Returns the ggplot object without printing it.
plot_immune_scatter <- function(data, x_col, y_col) {
  ggplot(data, aes(x = .data[[x_col]], y = .data[[y_col]])) +
    geom_point() +
    stat_cor(
      method = "spearman",
      label.x.npc = "center",
      label.y.npc = "top"
    ) +
    geom_smooth(method = "lm") +
    # Axis labels are set explicitly so they show the plain column names.
    labs(x = x_col, y = y_col)
}

for (cell_col in immune_cell_cols) {
  for (predictor_col in predictor_cols) {
    # print() is required: ggplot objects are not auto-printed inside a loop.
    print(plot_immune_scatter(cibersort_data_filt, predictor_col, cell_col))
  }
}
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=T.cells.CD4.memory.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=T.cells.CD4.memory.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=T.cells.CD4.memory.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=T.cells.CD4.memory.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=T.cells.CD4.memory.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=T.cells.CD4.memory.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=T.cells.follicular.helper))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=T.cells.follicular.helper))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=T.cells.follicular.helper))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=T.cells.regulatory..Tregs.))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=T.cells.regulatory..Tregs.))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=T.cells.regulatory..Tregs.))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=T.cells.gamma.delta))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=T.cells.gamma.delta))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=T.cells.gamma.delta))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=NK.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=NK.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=NK.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=NK.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=NK.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=NK.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Monocytes))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Monocytes))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Monocytes))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Macrophages.M0))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Macrophages.M0))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Macrophages.M0))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Macrophages.M1))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Macrophages.M1))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Macrophages.M1))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Macrophages.M2))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Macrophages.M2))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Macrophages.M2))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Dendritic.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Dendritic.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Dendritic.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Dendritic.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Dendritic.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Dendritic.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Mast.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Mast.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Mast.cells.resting))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Mast.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Mast.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Mast.cells.activated))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Eosinophils))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Eosinophils))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Eosinophils))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=TMB,y=Neutrophils))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_av,y=Neutrophils))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(cibersort_data_filt,aes(x=splice_mut_med,y=Neutrophils))+
  geom_point()+
  stat_cor(method = "spearman",
           label.x.npc = "center",
           label.y.npc = "top")+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

Analyzing Splicing Factor Mutations per cancer type

# Load the splicing-factor gene list (upper-cased), write it back out as a
# plain one-column text file, then read the splicing-factor MAF and build a
# per-sample summary table keyed by tumour sample barcode.
splicing_factor_genes <- read_excel("/media/theron/My_Passport/TCGA_junctions/ext_dat/splicing_factor_genes1.xlsx")
splicing_factor_genes <- toupper(splicing_factor_genes$Gene)
write.table(
  data.frame(splicing_factor_genes),
  file = "/media/theron/My_Passport/TCGA_junctions/ext_dat/splicing_factor_genes.txt",
  sep = "\t",
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE
)

splice_maf <- read.maf("/media/theron/My_Passport/TCGA_junctions/splice_factor.maf")

# Convert to a plain data.frame *before* assigning row names.
# NOTE(review): getSampleSummary() appears to return a data.table, and row
# names set on it are not expected to survive data.frame() conversion; the
# original order (row names first, conversion second) relied on that.
splice_maf_samp <- data.frame(getSampleSummary(splice_maf))

# Sample-level barcode with its final character removed.
splice_maf_samp$Tumor_Sample_ID <- sub(
  ".$", "",
  TCGAbarcode(as.character(splice_maf_samp$Tumor_Sample_Barcode), sample = TRUE)
)
splice_maf_samp$participant_ID <- TCGAbarcode(
  as.character(splice_maf_samp$Tumor_Sample_Barcode),
  participant = TRUE
)
rownames(splice_maf_samp) <- splice_maf_samp$Tumor_Sample_Barcode

Accumulating splicing factor mutation data per sample

# For every cancer directory listed in filenames.txt: match the splicing-factor
# MAF sample summary to the per-cluster splicing scores, compute per-sample
# mean/median scores alongside log10(total mutations + 1), record per-cancer
# summaries in `cluster_metrics_tum`, and plot score vs. TMB.
# `splice_mut_per_sample_all` collects the per-sample rows of all cancers.
mc3_maf <- splice_maf_samp
rownames(mc3_maf) <- mc3_maf$Tumor_Sample_Barcode

junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)

tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
# TMB<-list()

cluster_metrics_tum <- data.frame(cancers)

# Classify one barcode by the two-digit code that starts its fourth
# dash-separated field: 0-9 -> "T", 10-19 -> "N", anything else -> "C".
barcode_sample_type <- function(barcode) {
  code <- as.numeric(substr(strsplit(barcode, "-")[[1]][4], 1, 2))
  if (code <= 9) {
    "T"
  } else if (code <= 19) {
    "N"
  } else {
    "C"
  }
}

# One slot per cancer; bound together after the loop instead of growing a
# data frame with rbind() on every iteration.
per_cancer_samples <- vector("list", nrow(tumor_data))

for (i in seq_len(nrow(tumor_data))) {
  print(sprintf("%d out of %d", i, nrow(tumor_data)))
  tumor_dir <- tumor_data[i, ]
  cancer <- basename(tumor_dir)
  print(cancer)

  tumor_meta_file <- sprintf("%s/%s_metadata.txt", tumor_dir, cancer)
  tumor_meta <- read.table(tumor_meta_file, quote = "", sep = "\t")
  tumor_meta$participant_ID <- TCGAbarcode(tumor_meta[, 4], participant = TRUE)
  tumor_meta$nbases <- tumor_meta[, ncol(tumor_meta) - 9]

  # Keep the MAF summary rows of this cancer's participants, tumour samples only.
  mc3_maf_small <- subset(mc3_maf, participant_ID %in% tumor_meta$participant_ID)
  mc3_maf_small <- mc3_maf_small[complete.cases(mc3_maf_small), ]
  mc3_maf_small$type <- vapply(
    rownames(mc3_maf_small), barcode_sample_type, character(1)
  )
  mc3_maf_small$TMB <- log10(mc3_maf_small$total + 1)
  mc3_maf_small <- mc3_maf_small %>% dplyr::filter(type == "T")

  tumor_geno_file <- sprintf("%s/%s_genotypes.txt", tumor_dir, cancer)
  tumor_geno <- read.table(tumor_geno_file, header = TRUE)
  splice_mut_file <- sprintf("%s/%s_splice_dat_clusters_filt.rds", tumor_dir, cancer)
  splice_mut_data <- readRDS(splice_mut_file)

  # Undo the name mangling of the stored column names (a leading "X" and "."
  # in place of "-") and translate external ids into sample ids. Only a
  # *leading* "X" is stripped, so an "X" inside an id is left untouched.
  external_ids <- str_replace(colnames(splice_mut_data), "^X", "")
  external_ids <- str_replace_all(external_ids, "[.]", "-")
  colnames(splice_mut_data) <- as.character(
    tumor_geno$sample_id[match(external_ids, tumor_geno$external_id)]
  )

  mc3_maf_small <- mc3_maf_small %>%
    dplyr::filter(Tumor_Sample_ID %in% colnames(splice_mut_data))
  # drop = FALSE keeps the two-dimensional shape when only one sample remains.
  splice_mut_data <- splice_mut_data[, mc3_maf_small$Tumor_Sample_ID, drop = FALSE]

  splice_mut_per_sample <- data.frame(
    sample = colnames(splice_mut_data),
    splice_mut_av = apply(splice_mut_data, 2, mean),
    splice_mut_med = apply(splice_mut_data, 2, median),
    TMB = mc3_maf_small$TMB,
    # Recorded so that later lookups of a sample's cancer type
    # (splice_mut_per_sample_all[, "cancer"]) have a column to read.
    cancer = rep(cancer, ncol(splice_mut_data)),
    row.names = NULL,
    stringsAsFactors = FALSE
  )

  cancer_row <- cluster_metrics_tum$cancers == cancer
  positive_medians <-
    splice_mut_per_sample$splice_mut_med[splice_mut_per_sample$splice_mut_med > 0]
  cluster_metrics_tum[cancer_row, "splice_imm_med"] <- median(positive_medians)
  cluster_metrics_tum[cancer_row, "splice_imm_av"] <- mean(splice_mut_per_sample$splice_mut_av)
  cluster_metrics_tum[cancer_row, "TMB"] <- median(splice_mut_per_sample$TMB)

  per_cancer_samples[[i]] <- splice_mut_per_sample

  splice_mut_data_mat <- as.matrix(log10(splice_mut_data + 1))
  colnames(splice_mut_data_mat) <- colnames(splice_mut_data)

  # print(Heatmap(splice_mut_data_mat,
  #         top_annotation = HeatmapAnnotation(TMB=anno_barplot(mc3_maf_small$TMB)),
  #         show_row_names=F,
  #         show_column_names = F,
  #         cluster_rows=T,
  #         cluster_columns=T))

  # The cancer name is passed as the title directly; running it through
  # sprintf() as a format string would misread any "%" it contained.
  print(ggplot(splice_mut_per_sample, aes(x = splice_mut_av, y = TMB)) +
    geom_point() +
    stat_cor(method = "spearman") +
    geom_smooth(method = "lm") +
    labs(title = cancer))

  # The median plot only shows samples with a non-zero median score.
  splice_mut_per_sample <- splice_mut_per_sample %>% dplyr::filter(splice_mut_med > 0)

  print(ggplot(splice_mut_per_sample, aes(x = splice_mut_med, y = TMB)) +
    geom_point() +
    stat_cor(method = "spearman") +
    geom_smooth(method = "lm") +
    labs(title = cancer))
}

splice_mut_per_sample_all <- do.call(rbind, per_cancer_samples)

# Pan-cancer summaries: per-cancer medians/means plotted as cancer labels,
# followed by per-sample scatter plots across all cancers.
ggplot(cluster_metrics_tum, aes(x = log10(splice_imm_med + 1), y = TMB, label = cancers)) +
  geom_text() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  xlab("splicing antigenicity median")

ggplot(cluster_metrics_tum, aes(x = log10(splice_imm_av + 1), y = TMB, label = cancers)) +
  geom_text() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  xlab("splicing antigenicity average")

# The titles below must be passed as `title =`; an unnamed argument to labs()
# is not attached to any plot label, so the title was never drawn.
ggplot(splice_mut_per_sample_all, aes(x = splice_mut_av, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = "All Samples")

ggplot(splice_mut_per_sample_all, aes(x = splice_mut_med, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = "All Samples")

# From here on only samples with a non-zero median score are kept.
splice_mut_per_sample_all <- splice_mut_per_sample_all %>% dplyr::filter(splice_mut_med > 0)

ggplot(splice_mut_per_sample_all, aes(x = splice_mut_med, y = TMB)) +
  geom_point() +
  stat_cor(
    method = "spearman",
    label.x.npc = "center",
    label.y.npc = "top"
  ) +
  geom_smooth(method = "lm") +
  labs(title = "All Samples, No Zero Median samples")

Splice MAF heatmaps

# Restrict the MAF records to splicing-factor genes and annotate every record
# with its sample's splicing scores and cancer type, taken from
# `splice_mut_per_sample_all`. Records whose sample is absent from that table
# get scores of 0 and cancer "None", and are removed at the end of the block.
splice_maf_data <- splice_maf@data
splice_maf_data <- splice_maf_data %>% dplyr::filter(Hugo_Symbol %in% splicing_factor_genes)
splice_maf_data <- splice_maf_data[, c("Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode")]
# Sample-level barcode with its final character removed.
splice_maf_data$Tumor_Sample_ID <- sub(
  ".$", "",
  TCGAbarcode(as.character(splice_maf_data$Tumor_Sample_Barcode), sample = TRUE)
)
splice_maf_data$participant_ID <- TCGAbarcode(
  as.character(splice_maf_data$Tumor_Sample_Barcode),
  participant = TRUE
)
splice_maf_data$cancer <- NA
splice_maf_data_fill <- splice_maf_data
splice_maf_data_fill$cancer <- as.character(splice_maf_data_fill$cancer)

# One vectorised lookup instead of a which() scan per MAF record. match()
# returns the first hit, so a sample listed more than once no longer breaks
# the fixed-length result.
maf_sample_row <- match(
  splice_maf_data_fill$Tumor_Sample_ID,
  splice_mut_per_sample_all$sample
)
maf_sample_found <- !is.na(maf_sample_row)

splice_maf_data_fill$splice_ant_av <- ifelse(
  maf_sample_found,
  splice_mut_per_sample_all$splice_mut_av[maf_sample_row],
  0
)
splice_maf_data_fill$splice_ant_med <- ifelse(
  maf_sample_found,
  splice_mut_per_sample_all$splice_mut_med[maf_sample_row],
  0
)

# Fail with a clear message rather than an "undefined columns" error when the
# per-sample table was built without recording the cancer type.
if (!"cancer" %in% colnames(splice_mut_per_sample_all)) {
  stop(
    "splice_mut_per_sample_all has no 'cancer' column; ",
    "record the cancer type when accumulating per-sample data.",
    call. = FALSE
  )
}
splice_maf_data_fill$cancer <- ifelse(
  maf_sample_found,
  as.character(splice_mut_per_sample_all$cancer[maf_sample_row]),
  "None"
)


junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)

tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
# TMB<-list()

splice_maf_data_fill <- splice_maf_data_fill %>% dplyr::filter(cancer != "None")

mutation heatmap

# Build a samples x splicing-factor-genes matrix of mutation record counts,
# keep the samples whose counts vary across genes, and order them by cancer.
sample_cancer_dat <- unique(splice_maf_data_fill[, c("Tumor_Sample_ID", "cancer", "splice_ant_av", "splice_ant_med")])

# Count every (sample, gene) pair in a single pass with table() instead of
# filtering the MAF once per sample and scanning it once per gene.
heatmap_sample_ids <- as.character(sample_cancer_dat$Tumor_Sample_ID)
known_gene <- !is.na(splicing_factor_genes)
pair_counts <- unclass(table(
  factor(splice_maf_data_fill$Tumor_Sample_ID, levels = unique(heatmap_sample_ids)),
  factor(splice_maf_data_fill$Hugo_Symbol, levels = unique(splicing_factor_genes[known_gene]))
))

# One column per entry of splicing_factor_genes, in its original order; an NA
# entry keeps a column of zeros, as it matched no record before either.
mutation_counts <- matrix(
  0,
  nrow = length(heatmap_sample_ids),
  ncol = length(splicing_factor_genes)
)
mutation_counts[, known_gene] <- pair_counts[
  heatmap_sample_ids,
  splicing_factor_genes[known_gene],
  drop = FALSE
]

sample_splice_factor_dat <- data.frame(mutation_counts)
rownames(sample_splice_factor_dat) <- sample_cancer_dat$Tumor_Sample_ID
colnames(sample_splice_factor_dat) <- splicing_factor_genes

# Samples with identical counts for every gene (zero standard deviation)
# carry no pattern and are dropped; the selection is computed once and
# applied to both tables so they stay row-aligned.
varying_samples <- which(apply(sample_splice_factor_dat, 1, sd) > 0)
sample_splice_factor_dat_muts <- sample_splice_factor_dat[varying_samples, ]
sample_cancer_dat_muts <- sample_cancer_dat[varying_samples, ]
cancer_order <- order(sample_cancer_dat_muts$cancer)
sample_splice_factor_dat_muts <- sample_splice_factor_dat_muts[cancer_order, ]
sample_cancer_dat_muts <- sample_cancer_dat_muts[cancer_order, ]

# Heatmaps of the per-sample splicing-factor mutation counts, drawn four ways:
# log2(count + 1) with the average score as the side barplot, and row-scaled
# counts with the median score as the side barplot, each first with and then
# without row clustering. The left annotation always shows the cancer type.
draw_mutation_heatmap <- function(mat, barplot_values, cluster_rows) {
  Heatmap(
    mat,
    right_annotation = rowAnnotation(spliceant = anno_barplot(barplot_values)),
    left_annotation = rowAnnotation(cancer = sample_cancer_dat_muts$cancer),
    show_row_names = FALSE,
    show_column_names = FALSE,
    cluster_rows = cluster_rows,
    cluster_columns = TRUE
  )
}

draw_mutation_heatmap(
  log2(sample_splice_factor_dat_muts + 1),
  sample_cancer_dat_muts$splice_ant_av,
  cluster_rows = TRUE
)
draw_mutation_heatmap(
  t(scale(t(sample_splice_factor_dat_muts))),
  sample_cancer_dat_muts$splice_ant_med,
  cluster_rows = TRUE
)

draw_mutation_heatmap(
  log2(sample_splice_factor_dat_muts + 1),
  sample_cancer_dat_muts$splice_ant_av,
  cluster_rows = FALSE
)

draw_mutation_heatmap(
  t(scale(t(sample_splice_factor_dat_muts))),
  sample_cancer_dat_muts$splice_ant_med,
  cluster_rows = FALSE
)

Summary of sample_splice_factor_dat_muts

# Total splicing-factor mutation count per sample, compared with the sample's
# splicing score per cancer type and across all cancers.
#
# NOTE(review): these plots previously mapped a column named `splice_ant`,
# which sample_cancer_dat_muts does not have (it carries `splice_ant_av` and
# `splice_ant_med`). `splice_ant_av` is used here to match the log-scaled
# heatmaps; confirm whether the median was intended instead.
sample_cancer_dat_muts$sum <- rowSums(sample_splice_factor_dat_muts)
for (i in unique(sample_cancer_dat_muts$cancer)) {
  specific_cancer <- sample_cancer_dat_muts %>% dplyr::filter(cancer == i)
  print(
    ggplot(specific_cancer, aes(x = log10(splice_ant_av + 1), y = log2(sum + 1))) +
      geom_point() +
      labs(title = i)
  )
}


ggplot(sample_cancer_dat_muts, aes(x = cancer, y = log2(sum + 1))) +
  geom_boxplot()
ggplot(sample_cancer_dat_muts, aes(x = cancer, y = log10(splice_ant_av + 1))) +
  geom_boxplot()

ggplot(sample_cancer_dat_muts, aes(x = log10(splice_ant_av + 1), y = log10(sum + 1))) +
  geom_point() +
  stat_cor(method = "spearman") +
  geom_smooth(method = "lm")

Determining per gene metrics for each cancer type

#' Extract the numerator from "numerator/denominator" strings
#'
#' Each element is split on "/" independently, so an element without a "/"
#' (or an empty or NA element) cannot shift the values of its neighbours.
#'
#' @param vals Character vector of "a/b" strings (coerced with as.character).
#' @return Unnamed numeric vector the same length as `vals`, holding the part
#'   before the first "/"; NA where that part is missing or not a number.
split_num <- function(vals) {
  pieces <- strsplit(as.character(vals), "/", fixed = TRUE)
  # `[` on an empty piece yields NA_character_, which as.numeric() keeps as NA.
  as.numeric(vapply(pieces, `[`, character(1), 1L))
}
#' Extract the denominator from "numerator/denominator" strings
#'
#' Each element is split on "/" independently, so an element without a "/"
#' yields NA for itself instead of borrowing a value from a neighbour.
#'
#' @param vals Character vector of "a/b" strings (coerced with as.character).
#' @return Unnamed numeric vector the same length as `vals`, holding the part
#'   after the first "/"; NA where that part is missing or not a number.
split_dom <- function(vals) {
  pieces <- strsplit(as.character(vals), "/", fixed = TRUE)
  # `[` past the end of a piece yields NA_character_, kept as NA by as.numeric().
  as.numeric(vapply(pieces, `[`, character(1), 2L))
}

# Placeholder with no implementation yet: ignores `vals` and returns NULL.
eval_clusters <- function(vals) {
  NULL
}
  
# splicemutr data needed per cancer type
# per-line counts needed per cancer type
# only keep those proteins that are longer than 9 kmers

# For every cancer not listed in `nogos`: weight the per-sample summary values
# by the junction usage ratio (numerator / denominator of data_perind.counts),
# keep the rows that pass the peptide / verdict / deltapsi filters, average
# the weighted values within each cluster, and save the per-cluster table and
# the cluster annotation as RDS files in the cancer's directory.
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")

# is.nan() has no data-frame method; this one returns a logical matrix so that
# `df[is.nan(df)] <- 0` works. Defined once rather than on every iteration.
is.nan.data.frame <- function(x) {
  do.call(cbind, lapply(x, is.nan))
}

for (i in seq_len(nrow(tumor_data))) {
  print(sprintf("%d out of %d", i, nrow(tumor_data)))
  tumor_dir <- tumor_data[i, ]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos) {
    next
  }

  splice_dat_file <- sprintf("%s/%s_splicemutr_dat.txt", tumor_dir, cancer)
  splice_dat <- read.table(splice_dat_file, header = TRUE, sep = "\t")
  tumor_geno_file <- sprintf("%s/%s_genotypes.txt", tumor_dir, cancer)
  tumor_geno <- read.table(tumor_geno_file, header = TRUE)
  # tumor_geno <- tumor_geno %>% dplyr::filter(type == "T")
  summary_file <- sprintf("%s/summaries.txt", tumor_dir)
  summaries <- read.table(summary_file)$V1
  # The per-sample line files sit next to the listed kmer summary files.
  summaries <- str_replace(summaries, "kmers_summary", "persamp_line")
  meta_file <- sprintf("%s/%s_metadata.rds", tumor_dir, cancer)
  meta_dat <- readRDS(meta_file)
  rownames(meta_dat) <- meta_dat$external_id
  psi_file <- sprintf("%s/data_perind.counts", tumor_dir)
  psi_dat <- read.table(psi_file, header = TRUE, check.names = FALSE)
  psi_dat <- psi_dat[, c("chrom", tumor_geno$external_id)]
  # Rename the count columns from external ids to TCGA barcodes.
  sample_names <- colnames(psi_dat)[seq(2, ncol(psi_dat))]
  sample_names <- meta_dat[sample_names, "tcga.tcga_barcode"]
  colnames(psi_dat)[seq(2, ncol(psi_dat))] <- sample_names

  # Every summary file ends with two extra columns (named "row" and "cluster"
  # below); they are kept from the last file only and dropped from the others.
  n_summaries <- length(summaries)
  summary_parts <- lapply(seq_len(n_summaries), function(k) {
    part <- read.table(summaries[k], header = FALSE, sep = "\t")
    if (k < n_summaries) {
      part <- part[, seq_len(ncol(part) - 2), drop = FALSE]
    }
    part
  })
  summaries_combined <- do.call(cbind, summary_parts)

  sample_types <- sprintf("%s_%s", tumor_geno$sample_id, tumor_geno$type)
  sample_types <- c(sample_types, "row", "cluster")
  colnames(summaries_combined) <- sample_types
  # One tumour-column index, taken from the genotype table, is used for both
  # the summaries and the usage ratios so the two always select the same
  # samples (a "_T" text search could also hit an id containing "_T").
  tumor_cols <- which(tumor_geno$type == "T")
  summaries_combined <- summaries_combined[, c(tumor_cols, length(sample_types) - 1, length(sample_types))]

  # Usage ratio per junction and sample; 0/0 (NaN) is treated as 0.
  count_cols <- psi_dat[, seq(2, ncol(psi_dat)), drop = FALSE]
  num <- data.frame(apply(count_cols, 2, split_num))
  denom <- data.frame(apply(count_cols, 2, split_dom))
  psi <- num / denom
  psi[is.nan(psi)] <- 0
  psi$chrom <- psi_dat$chrom
  colnames(psi)[seq(1, ncol(psi) - 1)] <- colnames(psi_dat)[seq(2, ncol(psi_dat))]
  psi <- psi[, c(tumor_cols, ncol(psi))]
  # NOTE(review): `row` is used as a zero-based row index here — confirm.
  psi_summary <- psi[summaries_combined$row + 1, ]

  # drop = FALSE keeps data frames even when a single tumour sample is left.
  summaries_combined_psi <-
    psi_summary[, seq(1, ncol(psi_summary) - 1), drop = FALSE] *
    summaries_combined[, seq(1, ncol(summaries_combined) - 2), drop = FALSE]

  splice_dat_specific <- splice_dat[summaries_combined$row + 1, ]

  # Keep rows that have a peptide, are not "annotated" + "changed", and have a
  # positive deltapsi. The same index is applied to all three tables so they
  # stay row-aligned (summaries_combined used to miss the first filter).
  rows_to_keep <- which(
    !is.na(splice_dat_specific$peptide) &
      !(splice_dat_specific$verdict == "annotated" &
        splice_dat_specific$modified == "changed") &
      splice_dat_specific$deltapsi > 0
  )
  summaries_combined_psi <- summaries_combined_psi[rows_to_keep, , drop = FALSE]
  summaries_combined_psi[is.na(summaries_combined_psi)] <- 0
  summaries_combined <- summaries_combined[rows_to_keep, ]
  splice_dat_specific <- splice_dat_specific[rows_to_keep, ]

  clusters <- data.frame(table(splice_dat_specific$cluster))
  rownames(clusters) <- clusters$Var1
  clusters$Var1 <- as.character(clusters$Var1)

  # Row positions of every cluster, computed once instead of rescanning the
  # whole table for each cluster.
  rows_by_cluster <- split(
    seq_len(nrow(splice_dat_specific)),
    splice_dat_specific$cluster
  )

  clusters$genes <- vapply(clusters$Var1, function(clu) {
    paste(unique(splice_dat_specific$gene[rows_by_cluster[[clu]]]), collapse = ":")
  }, character(1))

  # Per-cluster mean of the weighted values: column sums over the cluster's
  # rows divided by the number of rows in the cluster.
  cluster_means <- lapply(clusters$Var1, function(clu) {
    cluster_values <- summaries_combined_psi[rows_by_cluster[[clu]], , drop = FALSE]
    colSums(cluster_values) / clusters[clu, "Freq"]
  })
  names(cluster_means) <- clusters$Var1
  splice_dat_clusters <- data.frame(do.call(rbind, cluster_means))

  saveRDS(splice_dat_clusters, file = sprintf("%s/%s_splice_dat_clusters.rds", tumor_dir, cancer))
  saveRDS(clusters, file = sprintf("%s/%s_clusters.rds", tumor_dir, cancer))

  # Release the per-cancer objects before the next iteration.
  rm(clusters,
     cluster_means,
     rows_by_cluster,
     splice_dat_clusters,
     splice_dat_specific,
     rows_to_keep,
     summaries_combined,
     summaries_combined_psi,
     summary_parts,
     n_summaries,
     count_cols,
     psi_summary, psi,
     tumor_cols,
     sample_types,
     psi_dat,
     psi_file,
     meta_dat,
     meta_file,
     summaries,
     summary_file,
     tumor_geno,
     tumor_geno_file,
     splice_dat,
     splice_dat_file,
     cancer,
     tumor_dir)
}

Compiling per-gene metrics

# Collect the union of gene identifiers over all analysed cancer cohorts.
# Each row of filenames.txt is treated as a cohort directory; the directory's
# basename is used as the cancer code.
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
# Cohorts that are skipped by the loops in this section.
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")

genes <- c()
# seq_len() instead of seq(): with a zero-row table seq(0) is c(1, 0), which
# would run the body on rows that do not exist.
for (i in seq_len(nrow(tumor_data))){
  print(sprintf("%d out of %d",i,nrow(tumor_data)))
  # Take the path from V1 (as `cancers` above does) so a scalar path is
  # obtained even if the table ever carries additional columns.
  tumor_dir <- tumor_data$V1[i]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos){next}

  # Per-cohort table; only its `genes` column is needed here.
  per_gene_data<-readRDS(file=sprintf("%s/%s_per_gene_data.rds",tumor_dir,cancer))
  genes <- unique(c(genes,per_gene_data$genes))
}

# Build two gene x cancer tables keyed by gene name: one holding each cohort's
# `median_binders`, the other its `ann_prop`. The first column of each table is
# the gene identifier itself; one column per processed cancer is appended below.
per_gene_data_tot_binders <- data.frame(genes)
per_gene_data_tot_prop <- data.frame(genes)

rownames(per_gene_data_tot_binders) <- per_gene_data_tot_binders$genes
rownames(per_gene_data_tot_prop) <- per_gene_data_tot_prop$genes

# Rebuilt here so it lists only the cohorts that were actually processed.
cancers <- c()
# seq_len() instead of seq(): with a zero-row table seq(0) is c(1, 0).
for (i in seq_len(nrow(tumor_data))){
  print(sprintf("%d out of %d",i,nrow(tumor_data)))
  # Take the path from V1 so a scalar path is obtained even if the table ever
  # carries additional columns.
  tumor_dir <- tumor_data$V1[i]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos){next}
  cancers <- c(cancers,cancer)

  per_gene_data<-readRDS(file=sprintf("%s/%s_per_gene_data.rds",tumor_dir,cancer))
  # Assigning row names also fails loudly if a cohort lists a gene twice.
  rownames(per_gene_data)<-per_gene_data$genes
  # Index by character name: a factor index would silently be interpreted as
  # integer codes (row positions) instead of gene labels.
  gene_ids <- as.character(per_gene_data$genes)
  # -1 marks genes that are absent from this cohort; present genes are
  # overwritten with the cohort's values immediately afterwards.
  per_gene_data_tot_binders[,cancer]<- -1
  per_gene_data_tot_prop[,cancer]<- -1
  per_gene_data_tot_binders[gene_ids,cancer] <- per_gene_data$median_binders
  per_gene_data_tot_prop[gene_ids,cancer] <- per_gene_data$ann_prop

}
# Row annotation for the heatmap: gene identifiers containing a hyphen are
# labelled "Fusion", all others "Single".
# str_detect() is vectorised, so the whole vector is classified in one call
# rather than through a per-gene vapply() with a scalar if/else.
row_ann <- rep("Single", length(genes))
row_ann[str_detect(genes, "-")] <- "Fusion"
row_ann <- data.frame(row_ann)
rownames(row_ann)<-genes

# Drop the leading gene-identifier column so only the per-cancer numeric
# columns remain. `drop = FALSE` keeps a data frame even when a single cohort
# was processed (the seq(2, ncol) form collapsed to a vector in that case and
# errored when no cohort column existed).
per_gene_data_tot_binders <- per_gene_data_tot_binders[, -1, drop = FALSE]
per_gene_data_tot_prop <- per_gene_data_tot_prop[, -1, drop = FALSE]

# Composite per-gene score: element-wise product of the two tables.
per_gene_comp <- per_gene_data_tot_binders*per_gene_data_tot_prop

# Cells for genes absent from a cohort were initialised to -1 in BOTH tables,
# so their raw product is (-1) * (-1) = +1 and would be plotted as log10(2),
# indistinguishable from genuine signal. Reset those cells to 0 so they plot
# as log10(1) = 0.
# NOTE(review): assumes real median_binders / ann_prop values are never
# negative - confirm against the code that produces *_per_gene_data.rds.
absent <- per_gene_data_tot_binders < 0 | per_gene_data_tot_prop < 0
absent[is.na(absent)] <- FALSE
per_gene_comp[absent] <- 0

# Clustered heatmap of log10(score + 1) with the Fusion/Single row annotation.
# The data frame is converted explicitly because Heatmap() expects a matrix.
Heatmap(as.matrix(log10(per_gene_comp+1)),
        right_annotation = rowAnnotation(df=row_ann),
        show_row_names=FALSE,
        show_column_names = TRUE,
        cluster_rows=TRUE,
        cluster_columns=TRUE)